See https://quanteda.io for tutorials and examples.
library(leaflet)
Warning: package 'leaflet' was built under R version 4.3.3
library(tidygeocoder)
Warning: package 'tidygeocoder' was built under R version 4.3.3
# Reading the CSV Fileproduct_catalog <-read_csv("myntra_products_catalog.csv")
Rows: 12491 Columns: 8
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (5): ProductName, ProductBrand, Gender, Description, PrimaryColor
dbl (3): ProductID, Price (INR), NumImages
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
nrow(product_catalog)
[1] 12491
# Pre-processing dataproduct_catalog <- product_catalog %>%# Renaming columns named weirdlyrename(PriceINR =`Price (INR)`) %>%# Removing rows with blank entries or NAs in any column####filter(apply(., 1, function(row) all(row != "" & !is.na(row)))) %>%drop_na() %>%# Trimming leading and trailing whitespaces in all character columns# Adapted from stackoverflow: https://stackoverflow.com/questions/20760547/removing-whitespace-from-a-whole-data-frame-in-rmutate(across(where(is.character), trimws))head(product_catalog)
# A tibble: 6 × 8
ProductID ProductName ProductBrand Gender PriceINR NumImages Description
<dbl> <chr> <chr> <chr> <dbl> <dbl> <chr>
1 10017413 DKNY Unisex Blac… DKNY Unisex 11745 7 Black and …
2 10016283 EthnoVogue Women… EthnoVogue Women 5810 7 Beige & Gr…
3 10009781 SPYKAR Women Pin… SPYKAR Women 899 7 Pink colou…
4 10015921 Raymond Men Blue… Raymond Men 5599 5 Blue self-…
5 10017833 Parx Men Brown &… Parx Men 759 5 Brown and …
6 10014361 SHOWOFF Men Brow… SHOWOFF Men 791 5 Brown soli…
# ℹ 1 more variable: PrimaryColor <chr>
Expensive and budget-friendly brands
# Calculating the average price for each brandbrand_prices <- product_catalog %>%group_by(ProductBrand) %>%# Adapted from stackoverflow : https://stackoverflow.com/questions/25759891/dplyr-summarise-each-with-na-rmsummarise(AveragePrice =mean(PriceINR, na.rm =TRUE),NumProducts =n()) %>%filter(NumProducts >10) %>%ungroup() %>%arrange(desc(AveragePrice))# Identifying the top 3 most expensive brands and cheap brandstop_expensive_brands <-head(brand_prices, 3)top_cheap_brands <-tail(brand_prices, 3)# Displaying the resultsprint("Top 3 Most Expensive Brands: ")
# Getting product counts grouped by gendergender_counts <- product_catalog %>%group_by(Gender) %>%summarise(Count =n()) %>%ungroup()# Creating a bar graph for the number of products by genderggplot(gender_counts, aes(x = Gender, y = Count, fill = Gender, label = Count)) +geom_bar(stat ="identity", show.legend =FALSE) +theme_minimal() +labs(title ="Gender wise product distribution",x ="Gender",y ="Number of Products") +scale_fill_brewer(palette ="Set1") +geom_text(position =position_stack(vjust =0.5))
Color wise product distribution
# Getting product counts for each colorcolor_counts <- product_catalog %>%group_by(PrimaryColor) %>%summarise(Count =n()) %>%arrange(desc(Count))nrow(color_counts)
[1] 27
# Identifying top 5 colorstop_colors <-head(color_counts, 5)# Putting the rest of the colors in "Others" categoryothers_count <-sum(tail(color_counts, n =nrow(color_counts) -5)$Count)others_row <-data.frame(PrimaryColor ="Others", Count = others_count)# Combining the top colors with "Others"final_data <-rbind(top_colors, others_row)final_data
# A tibble: 6 × 2
PrimaryColor Count
<chr> <int>
1 Blue 3443
2 Black 1640
3 Red 1543
4 Green 908
5 White 880
6 Others 3183
# setting color styles manually# Adapted from open ai : 'Prompt - Color codes for the below colors'named_colors <-c("Black"="#808080", # Lighter grey instead of black"Blue"="#ADD8E6", # Light Blue"Red"="#FF9999", # Light Red"Green"="#90EE90", # Light Green"White"="#F8F8FF", # Off-White, Ghost White"Others"="#D8BFD8"# Light Purple (Thistle))# Changing the 'final_data' values as percentage of whole countfinal_data$Percentage <- (final_data$Count /sum(final_data$Count)) *100ggplot(final_data, aes(x ="", y = Count, fill = PrimaryColor, label =sprintf("%1.1f%%", Percentage))) +geom_bar(width =1, stat ="identity", color ="white") +coord_polar(theta ="y") +theme_void() +labs(title ="Color Based Product Distribution") +scale_fill_manual(values = named_colors) +geom_text(position =position_stack(vjust =0.5))
# blue, black and red are the most popular color categories
Sentiment and Review analysis using Dataset 2 and 3
# Reading the other 2 datasets (Detailed Product dataset and Product Review dataset)Product_Detailed_2 <-read_csv("Product_Detailed_2.csv")
Rows: 526564 Columns: 13
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (8): URL, BrandName, Category, Individual_category, category_by_Gender, ...
dbl (5): Product_id, DiscountPrice (in Rs), OriginalPrice (in Rs), Ratings, ...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Rows: 23486 Columns: 11
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (5): Title, Review Text, Division Name, Category, Class Name
dbl (6): index, Clothing ID, Age, Rating, Recommended IND, Positive Feedback...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Manually creating the mapping based on domain knowledge so that the Categories from both the dataset can be matched to each otherProduct_Detailed_2 <- Product_Detailed_2 %>%mutate(Category =case_when( Category =="Inner Wear & Sleep Wear"~"Intimate", Category =="Lingerie & Sleep Wear"~"Intimate", Category =="Western"~"Dresses", Category =="Bottom Wear"~"Bottoms", Category =="Topwear"~"Tops", Category =="Sports Wear"~"Jackets", Category =="Indian Wear"~"Trend",TRUE~as.character(Category) # Default case to keep original Category values ))
Sentiment scores and average ratings
# Average rating using Detailed product dataset (Dataset 2)avg_rating_by_category <- Product_Detailed_2 %>%group_by(Category) %>%summarise(AverageRating =mean(Ratings, na.rm =TRUE)) %>%ungroup()# Sentiment scores using Product Review dataset (Dataset 3)sentiment_analysis <- Product_Reviews_3 %>%unnest_tokens(word, 'Review Text') %>%inner_join(get_sentiments("bing"), by ="word", relationship ="many-to-many") %>%group_by(index) %>%summarise(sentiment_score =sum(case_when( sentiment =="positive"~1, sentiment =="negative"~-1,TRUE~0L )), .groups ='drop')# plotting sentiment score histogramggplot(sentiment_analysis, aes(x = sentiment_score)) +geom_histogram(bins =50) +labs(title ="Distribution of Sentiment Scores", x ="Sentiment Score", y ="Count")
# Joining the sentiments back to the review datasetreviewdataset_with_sentiments <- Product_Reviews_3 %>%left_join(sentiment_analysis, by ="index")# Grouping by Category for analysisavg_sentiments_by_category <- reviewdataset_with_sentiments %>%group_by(Category) %>%summarise(average_sentiment =mean(sentiment_score, na.rm =TRUE)) %>%ungroup() # Visualizing sentiment scores across categoriesavg_sentiments_by_category <-na.omit(avg_sentiments_by_category)p <-plot_ly(data = avg_sentiments_by_category, x =~Category, y =~average_sentiment, type ='bar', marker =list(color =~average_sentiment, colorscale ='Viridis'), showlegend =FALSE) %>%layout(title ='Distribution of Average Sentiment Scores by Category',xaxis =list(title ='Category'),yaxis =list(title ='Average Sentiment Score'))p
The histogram shows a normal distribution for the sentiment scores, most scores cluster around the middle, suggesting that most sentiments expressed are neutral. There are fewer very positive or very negative sentiments, as seen by the lower bars at the ends.
This bar chart displays average sentiment scores for different clothing categories. Tops, Dresses, and Jackets seem to have higher average sentiments, indicating more positive feedback, while the ‘Trend’ category has a noticeably lower average sentiment score.
Insights using Sentiment score and ratings
# Joining the datasets on the 'Category' column and displaying the resultscategory_insights <-inner_join(avg_rating_by_category, avg_sentiments_by_category, by ="Category")final_display <- category_insights %>%select(Category, AverageRating, average_sentiment)print(final_display)
# The bar chartp <-plot_ly(data = final_display, x =~Category) %>%add_bars(y =~AverageRating, name ='Average Rating', marker =list(color ='#ADD8E6')) %>%add_bars(y =~average_sentiment, name ='Average Sentiment', marker =list(color ='#FF9999')) %>%layout(yaxis =list(title ='Score'),barmode ='group',title ='Comparison of Average Ratings and Sentiments by Category')p
The bar chart shows comparison of average ratings and average sentiments across various categories such as “Bottoms,” “Dresses,” “Intimate,” and so on.
Insights from the chart:
Consistency: The average ratings and average sentiments are consistent across categories. This suggests that customers’ overall satisfaction levels are similar across different product types. Moreover the balance indicates that sentiment scores derived from reviews correlate well with the numerical ratings customers give. In other words, how customers speak about the product aligns with how they rate it.
Lack of Extremes and positiveness: There are no categories with extreme sentiment or rating scores. The positive ratings and sentiment values also indicate a moderately positive level of customer feeling towards the products in each categories.
Wordcloud of Category with high sentiment
# Filtering the dataset for the category "Tops"dresses_reviews <- Product_Reviews_3 %>%filter(Category =="Tops") %>%select('Review Text')# Creating a text corpus from the vector source of review texttext_corpus <-corpus(dresses_reviews$'Review Text')
Warning: NA is replaced by empty string
# Preprocessing to remove punctuations, numbers, spaces and common english words like 'the', 'at' since they are not relevant for analysiscleaned_corpus <- text_corpus %>%tokens() %>%tokens_tolower() %>%tokens_remove(pattern ="[[:punct:]]") %>%tokens_remove(pattern ="\\d+") %>%tokens_remove(pattern =stopwords("en")) %>%tokens_remove(pattern ="[[:space:]]+")# Adapted from: https://search.r-project.org/CRAN/refmans/quanteda/html/tokens_select.html# Generate the word cloudset.seed(1234)wordcloud(words = cleaned_corpus, scale =c(3, 0.5), max.words =100, random.order =FALSE, colors =brewer.pal(8, "Dark2"))
Loading required namespace: tm
The word cloud for “tops” highlights key words that stand out in customer reviews. “Love,” “like,” and “fit” are prominent, indicating these are common sentiments that customers frequently mention. Words like “color,” “size,” and “fabric” suggest these are important factors for customers when evaluating tops. Overall, this word cloud reflects positive customer experiences.
Analysis of Brand Average Ratings in Relation to Pricing and Product Count
# Building and summarizing a glm modelglm_model <-glm(AvgRating ~ AvgPrice1 + ProductCount, family =gaussian(), data = combined_data)summary(glm_model)
Call:
glm(formula = AvgRating ~ AvgPrice1 + ProductCount, family = gaussian(),
data = combined_data)
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.065e+00 5.395e-02 75.345 <2e-16 ***
AvgPrice1 3.787e-05 3.311e-05 1.144 0.260
ProductCount 1.234e-04 1.456e-04 0.847 0.403
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for gaussian family taken to be 0.02117982)
Null deviance: 0.78344 on 37 degrees of freedom
Residual deviance: 0.74129 on 35 degrees of freedom
AIC: -33.765
Number of Fisher Scoring iterations: 2
A Generalized Linear Model (GLM) explores the relationships between the average price and the product count on the average rating for brands with more than 50 products.
The GLM analysis, focusing on the average price and the product count as predictors of average rating, reveals that neither average price nor product count significantly affects the average ratings in a statistically meaningful way.
# Adding predicted ratingscombined_data$predicted_ratings <-predict(glm_model, type ="response")# Ploting the ratings dataggplot(combined_data, aes(x = AvgRating, y = predicted_ratings)) +geom_point(aes(color ="Actual vs. Predicted"), alpha =0.6) +geom_abline(intercept =0, slope =1, linetype ="dashed", color ="red") +labs(title ="Actual vs. Predicted Ratings",x ="Actual Ratings",y ="Predicted Ratings") +theme_minimal() +scale_color_manual(values =c("Actual vs. Predicted"="blue"))
While the statistical model did not reveal significant predictors for average ratings, the visualization of actual versus predicted ratings is presented here to show the model’s power.
We can see that most points are clustered around the line, but there is a spread, indicating some variance between the predicted and actual ratings.
Average price changes over time
We are using Top 5 categories from the ‘brand_prices’ variable generated in ‘Expensive and budget-friendly brands’ section that uses product dataset (dataset 1) that has products from 2015 and also generating top 5 brands from the detailed dataset (dataset 2) that has products from 2020 for brands that have more than 10 products and then comparing the prices over time.
# Aggregating brand data using Dataset 1 - product_catalogtop5_expensive_brands_2015 <-head(brand_prices, 5) %>%mutate(Year ="2015") %>%select(Brand = ProductBrand, AvgPrice=AveragePrice, Year)# Aggregating brand data using Dataset 2 - Product_Detailed_2top5_expensive_brands_2020 <- Product_Detailed_2 %>%group_by(BrandName) %>%filter(n() >10) %>%summarise(AvgPrice =mean(`OriginalPrice (in Rs)`, na.rm =TRUE) ) %>%ungroup() %>%mutate(Year ="2020") %>%arrange(desc(AvgPrice)) %>%head(5) %>%select(Brand = BrandName, AvgPrice, Year)# Binding the two while keeping the year distinctiontop_brands_combined <-bind_rows(top5_expensive_brands_2015, top5_expensive_brands_2020)top_brands_combined
# Arranging the dataframe by AvgPrice in descending ordertop_brands_combined <- top_brands_combined %>%mutate(Brand =factor(Brand, levels =unique(Brand[order(-AvgPrice)])))# Creating the plot and also displaying top 5 expensive brands from 2015 and 2020df_2015 <- top_brands_combined %>%filter(Year =="2015")df_2020 <- top_brands_combined %>%filter(Year =="2020")print("Top 5 Expensive brands 2015:")
# A tibble: 5 × 3
Brand AvgPrice Year
<fct> <dbl> <chr>
1 Teakwood Leathers 20832. 2020
2 WHITE FIRE 17104. 2020
3 Masaba 16952. 2020
4 DRESSTIVE 16558. 2020
5 KARAGIRI 15166. 2020
# Creating the bar chart for 2015p1 <-plot_ly(data = df_2015, x =~Brand, y =~AvgPrice, type ='bar', name ='2015',marker =list(color ='coral')) %>%layout(yaxis =list(title ='Average Price 2015 (INR)'))# Creating the bar chart for 2020p2 <-plot_ly(data = df_2020, x =~Brand, y =~AvgPrice, type ='bar', name ='2020',marker =list(color ='seagreen')) %>%layout(yaxis =list(title ='Average Price 2020 (INR)'),xaxis =list(title ='Brand'))# Combining the two plots into one with two subplots# Note: shareX = TRUE means that all subplots will share the same x-axis.subplot(p1, p2, nrows =2, shareX =TRUE) %>%layout(title ='Average Prices by Brand for 2015 and 2020')
# Adapted from: https://plotly.com/r/subplots/
This output shows a side-by-side comparison of average prices for various brands in two different years, 2015 and 2020. The orange bars represent the average prices in 2015, while the green bars represent the prices in 2020. The chart shows that, in general, brands are charging more for their products in 2020 than they did in 2015, prices have gone up in five years. The top brands have also changed like ‘Teakwood’, ‘White Fire’, however brands like ‘Seiko’ continue to dominate the expensive segment.
Brand location on world map using leaflet
# Loading the dataset that contains brands along with their founding countrylocation_of_brands <-read.csv("Location_of_brands.csv")head(location_of_brands)
# Finding lat and long using OSM geocodinggeocoded_data <- location_of_brands %>%geocode(country = location, method ='osm') %>%# Adapted from: https://geocoder.readthedocs.io/providers/OpenStreetMap.htmldistinct(location, .keep_all =TRUE) %>%select(location,lat,long)
Passing 10 addresses to the Nominatim single address geocoder
Query completed in: 10.2 seconds
print(geocoded_data)
# A tibble: 10 × 3
location lat long
<chr> <dbl> <dbl>
1 Germany 51.2 10.4
2 Denmark 55.7 10.3
3 India 22.4 78.7
4 Japan 36.6 139.
5 China 35.0 105.
6 Italy 42.6 12.7
7 Canada 61.1 -108.
8 France 46.6 1.89
9 Spain 39.3 -4.84
10 USA 39.8 -100.
# Joining the lat and long with the brands with location datasetbrand_location_geocoded <-inner_join(location_of_brands, geocoded_data, by ="location", relationship ="many-to-many") %>%select (Brands,location,lat,long)head(brand_location_geocoded)
Brands location lat long
1 Priyaasi Germany 51.16382 10.44783
2 AMMARZO Denmark 55.67025 10.33333
3 John Players Denmark 55.67025 10.33333
4 Nejo India 22.35111 78.66774
5 RAREISM Denmark 55.67025 10.33333
6 Mojama Germany 51.16382 10.44783
# Aggregating the data from detailed dataset 2 at brand levelaggregated_brands <- Product_Detailed_2 %>%group_by(BrandName) %>%summarise(AvgPrice =mean(`OriginalPrice (in Rs)`, na.rm =TRUE),ProductCount =n() ) %>%ungroup() %>%select(Brands = BrandName, AvgPrice, ProductCount)# Using the Brand aggregated dataset and brand geocoded dataset to create a list of top 3 brands for each country and product count for those brandstop_brands_by_country <- aggregated_brands %>%inner_join(brand_location_geocoded, by ="Brands") %>%group_by(location) %>%arrange(desc(AvgPrice)) %>%mutate(Rank =row_number()) %>%filter(Rank <=3) %>%summarise(TopBrands =paste(Brands, "(", round(AvgPrice, 2), "INR - ", ProductCount, "products)", collapse ="<br>") ) %>%ungroup()# Joining back with 'geocoded_data' to get latitude and longitudecountry_info <- geocoded_data %>%distinct(location, .keep_all =TRUE) %>%inner_join(top_brands_by_country, by ="location")print(country_info)